Second step: Exploratory data analysis
By: Polly Pang
In this portion of the notebook, I will perform the exploratory data analysis, which will give more information and insights about the dataset. Additionally, I will merge the dataset to preprocess it for the next NLP (Natural Language Processing) part.
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
import plotly.express as px
import plotly.subplots as sp
import joblib as joblib
# Load the cleaned dataset produced by the previous (data-cleaning) notebook step.
# NOTE(review): joblib.load unpickles the file — only load pickles from trusted sources.
fake_job_df = joblib.load('data/fake_job_cleaned.pkl')
# Preview the first five rows to confirm the load succeeded.
fake_job_df.head()
| title | department | has_salary_range | company_profile | description | requirements | benefits | telecommuting | has_company_logo | has_questions | employment_type | required_experience | required_education | industry | function | fraudulent | country | province | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Marketing Intern | Marketing | 0 | We're Food52, and we've created a groundbreaki... | Food52, a fast-growing, James Beard Award-winn... | Experience with content management systems a m... | Blank | 0 | 1 | 0 | Other | Internship | Blank | Blank | Marketing | 0 | US | NY | New York |
| 2 | Commissioning Machinery Assistant (CMA) | Blank | 0 | Valor Services provides Workforce Solutions th... | Our client, located in Houston, is actively se... | Implement pre-commissioning and commissioning ... | Blank | 0 | 1 | 0 | Blank | Blank | Blank | Blank | Blank | 0 | US | IA | Wever |
| 3 | Account Executive - Washington DC | Sales | 0 | Our passion for improving quality of life thro... | THE COMPANY: ESRI – Environmental Systems Rese... | EDUCATION: Bachelor’s or Master’s in GIS, busi... | Our culture is anything but corporate—we have ... | 0 | 1 | 0 | Full-time | Mid-Senior level | Bachelor's Degree | Computer Software | Sales | 0 | US | DC | Washington |
| 4 | Bill Review Manager | Blank | 0 | SpotSource Solutions LLC is a Global Human Cap... | JOB TITLE: Itemization Review ManagerLOCATION:... | QUALIFICATIONS:RN license in the State of Texa... | Full Benefits Offered | 0 | 1 | 1 | Full-time | Mid-Senior level | Bachelor's Degree | Hospital & Health Care | Health Care Provider | 0 | US | FL | Fort Worth |
| 5 | Accounting Clerk | Blank | 0 | Blank | Job OverviewApex is an environmental consultin... | Blank | Blank | 0 | 0 | 0 | Blank | Blank | Blank | Blank | Blank | 0 | US | MD | Blank |
# plot distribution of fraudulent and real job posts
# Bar chart of the class balance: number of fraudulent (1) vs. real (0) postings.
target_count = fake_job_df['fraudulent'].value_counts().to_frame()
axis_labels = {
    "index": "Fraud or Not",
    "fraudulent": "Fraudulent Count",
}
fig = px.bar(
    target_count,
    x=target_count.index,
    y=target_count['fraudulent'],
    color=target_count['fraudulent'],
    labels=axis_labels,
    title='Count Fraud VS Real',
    color_continuous_scale="ylgn",
    width=600,
    height=500,
)
fig.show()
# Pie chart of the same class balance; pct_job holds the normalized proportions.
pct_job = fake_job_df['fraudulent'].value_counts(normalize=True).to_frame()
pie_labels = {
    "index": "Fraud or Not",
    "fraudulent": "Fraudulent Count",
}
fig = px.pie(
    target_count,
    names=target_count.index,
    values=target_count['fraudulent'],
    color=target_count['fraudulent'],
    labels=pie_labels,
    title='% Fraud VS Real',
    width=600,
    height=500,
)
fig.show()
# check the columns
fake_job_df.columns
Index(['title', 'department', 'has_salary_range', 'company_profile',
'description', 'requirements', 'benefits', 'telecommuting',
'has_company_logo', 'has_questions', 'employment_type',
'required_experience', 'required_education', 'industry', 'function',
'fraudulent', 'country', 'province', 'city'],
dtype='object')
fake_job_df.sample()
| title | department | has_salary_range | company_profile | description | requirements | benefits | telecommuting | has_company_logo | has_questions | employment_type | required_experience | required_education | industry | function | fraudulent | country | province | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6672 | Shipper (Portland) | Operations | 0 | Our HistoryFounded in 1929 by Earl Bertrand Br... | Position available is located in our Portland,... | Essential Job FunctionsPicking;Packing;Stage o... | Blank | 0 | 1 | 1 | Blank | Blank | Blank | Blank | Other | 0 | US | OR | Portland |
company_profile, description, requirements, and benefits are all text features; I will investigate them further in the NLP process.
# Helper: make a table of the top 10 most frequent values for a feature
'''
Approach 1, matplotlibs, not using for now
'''
def top10_count_table_maker(col, df):
    """Return a two-column frame (value, counts) of the 10 most frequent values in df[col]."""
    counts = df[col].value_counts()
    top10 = counts.head(10).reset_index()
    top10.columns = [col, 'counts']
    return top10
# Helper function
# plot top 10 observation for the feature
def countplot_maker(col, df):
    """Countplot of the 10 most frequent values of `col`, split by the `fraudulent` hue.

    Rows holding the 'Blank' placeholder are excluded from the bars, while the
    top-10 ordering is computed over the full df[col] value counts.
    """
    top10_order = df[col].value_counts().iloc[:10].index
    non_blank = df[df[col] != 'Blank']
    plt.figure(figsize=(12, 6))
    sns.countplot(x=col, data=non_blank, hue="fraudulent", order=top10_order)
    plt.xticks(rotation=60, fontsize=12)
    plt.xlabel(col, fontsize=15)
    plt.ylabel("count", fontsize=15)
    plt.title(f"top 10 {col} distribution", fontsize=20)
    plt.show()
# Helper function
#Table target feature,fraudulent, counts and % of each propotions
def maker_pct(col1, col2, df):
    """Stacked bar chart of df[col2] (e.g. fraudulent) within each df[col1] category.

    Counts each (col1, col2) pair and annotates every bar segment with the
    percentage that segment represents inside its col1 group. Rows where col1
    holds the 'Blank' placeholder are dropped before plotting.
    """
    grouped = df.groupby([col1, col2]).size()
    df_out = grouped.reset_index()
    # BUG FIX: percentages were previously computed from the global fake_job_df
    # regardless of the `df` argument; reuse the grouped counts from `df` instead.
    df_out['percentage'] = (
        grouped.groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
    )
    df_out.columns = [col1, col2, 'Counts', 'Percentage']
    # get rid of 'Blank' placeholder categories
    df_out = df_out[df_out[col1] != 'Blank']
    fig = px.bar(
        df_out, x=col1, y='Counts', color=col2,
        text=df_out['Percentage'].apply(lambda x: '{0:1.2f}%'.format(x)),
        title=f"Distribution of {col1}", labels=[0, 1],
    )
    fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total ascending'})
    fig.update_layout(showlegend=False, overwrite=True)
    return fig.show()
# Helper function of make pct-components for each feature
def table_maker_pct(col, df):
    """Return the proportion of each value of df[col] (descending), excluding 'Blank'.

    Output columns are ['Type', col], where col holds the normalized share.

    BUG FIX: the old implementation renamed a reset_index() column called
    'index', which only exists in older pandas; on pandas >= 2.0 the column is
    named after the series, the rename is a no-op, and pct_out['Type'] raises
    KeyError. rename_axis/reset_index(name=...) works across versions.
    """
    pct_out = (
        df[col].value_counts(normalize=True)
        .rename_axis('Type')
        .reset_index(name=col)
        .sort_values(col, ascending=False)
    )
    # drop the 'Blank' placeholder category
    pct_out = pct_out[pct_out['Type'] != 'Blank']
    return pct_out
# fake job pct by employee type
table_maker_pct('employment_type',fake_job_df)
| Type | employment_type | |
|---|---|---|
| 0 | Full-time | 0.649869 |
| 2 | Contract | 0.085240 |
| 3 | Part-time | 0.044577 |
| 4 | Temporary | 0.013480 |
| 5 | Other | 0.012696 |
# distribution of fraudulent by employment type
maker_pct('employment_type','fraudulent',fake_job_df)
# fake job pct by telecommuting or not
table_maker_pct('telecommuting',fake_job_df)
| Type | telecommuting | |
|---|---|---|
| 0 | 0 | 0.957101 |
| 1 | 1 | 0.042899 |
maker_pct('telecommuting','fraudulent',fake_job_df)
# fake job pct by has_company_logo or not
table_maker_pct('has_company_logo',fake_job_df)
| Type | has_company_logo | |
|---|---|---|
| 0 | 1 | 0.795291 |
| 1 | 0 | 0.204709 |
maker_pct('has_company_logo','fraudulent',fake_job_df)
# fake job pct by questions or not
table_maker_pct('has_questions',fake_job_df)
| Type | has_questions | |
|---|---|---|
| 0 | 0 | 0.50825 |
| 1 | 1 | 0.49175 |
maker_pct('has_questions','fraudulent',fake_job_df)
has_questions is a nearly half-and-half split. From this investigation, I observed that fake posts are slightly more likely to include questions in the job post.
# pct of fake jobs by different working experience
table_maker_pct('required_experience',fake_job_df)
| Type | required_experience | |
|---|---|---|
| 1 | Mid-Senior level | 0.213043 |
| 2 | Entry level | 0.150847 |
| 3 | Associate | 0.128475 |
| 4 | Not Applicable | 0.062364 |
| 5 | Director | 0.021757 |
| 6 | Internship | 0.021310 |
| 7 | Executive | 0.007886 |
maker_pct('required_experience','fraudulent',fake_job_df)
table_maker_pct('required_education',fake_job_df)
| Type | required_education | |
|---|---|---|
| 1 | Bachelor's Degree | 0.287768 |
| 2 | High School or equivalent | 0.116338 |
| 3 | Unspecified | 0.078136 |
| 4 | Master's Degree | 0.023268 |
| 5 | Associate Degree | 0.015325 |
| 6 | Certification | 0.009508 |
| 7 | Some College Coursework Completed | 0.005705 |
| 8 | Professional | 0.004139 |
| 9 | Vocational | 0.002741 |
| 10 | Some High School Coursework | 0.001510 |
| 11 | Doctorate | 0.001454 |
| 12 | Vocational - HS Diploma | 0.000503 |
| 13 | Vocational - Degree | 0.000336 |
maker_pct('required_education','fraudulent',fake_job_df)
de_out=top10_count_table_maker('department',fake_job_df)
de_out[de_out['department']!='Blank']
| department | counts | |
|---|---|---|
| 1 | Sales | 551 |
| 2 | Engineering | 487 |
| 3 | Marketing | 401 |
| 4 | Operations | 270 |
| 5 | IT | 225 |
| 6 | Development | 146 |
| 7 | Product | 112 |
| 8 | Information Technology | 86 |
| 9 | Technology | 76 |
# top 10 Department distribution (number count)
countplot_maker('department',fake_job_df[fake_job_df['department']!='Blank'])
# top 10 title
top10_count_table_maker('title',fake_job_df)
| title | counts | |
|---|---|---|
| 0 | English Teacher Abroad | 311 |
| 1 | Customer Service Associate | 146 |
| 2 | Graduates: English Teacher Abroad (Conversatio... | 144 |
| 3 | English Teacher Abroad | 95 |
| 4 | Software Engineer | 86 |
| 5 | English Teacher Abroad (Conversational) | 83 |
| 6 | Customer Service Associate - Part Time | 76 |
| 7 | Account Manager | 75 |
| 8 | Web Developer | 66 |
| 9 | Project Manager | 62 |
# pct of top 10 titles
table_maker_pct('title',fake_job_df)[:11]
| Type | title | |
|---|---|---|
| 0 | English Teacher Abroad | 0.017395 |
| 1 | Customer Service Associate | 0.008166 |
| 2 | Graduates: English Teacher Abroad (Conversatio... | 0.008054 |
| 3 | English Teacher Abroad | 0.005313 |
| 4 | Software Engineer | 0.004810 |
| 5 | English Teacher Abroad (Conversational) | 0.004642 |
| 6 | Customer Service Associate - Part Time | 0.004251 |
| 7 | Account Manager | 0.004195 |
| 8 | Web Developer | 0.003691 |
| 9 | Project Manager | 0.003468 |
| 10 | Beauty & Fragrance consultants needed | 0.003356 |
#plot top 10 title
countplot_maker('title',fake_job_df[fake_job_df['title']!='Blank'])
# top 10 industry
in_out=top10_count_table_maker('industry',fake_job_df)
in_out[in_out['industry']!='Blank']
| industry | counts | |
|---|---|---|
| 1 | Information Technology and Services | 1734 |
| 2 | Computer Software | 1376 |
| 3 | Internet | 1062 |
| 4 | Marketing and Advertising | 827 |
| 5 | Education Management | 822 |
| 6 | Financial Services | 779 |
| 7 | Hospital & Health Care | 497 |
| 8 | Consumer Services | 358 |
| 9 | Telecommunications | 342 |
# top 10 industry plot
countplot_maker('industry',fake_job_df[fake_job_df['industry']!='Blank'])
# top 10 function
func_out=top10_count_table_maker('function',fake_job_df)
func_out[func_out['function']!='Blank']
| function | counts | |
|---|---|---|
| 1 | Information Technology | 1749 |
| 2 | Sales | 1468 |
| 3 | Engineering | 1348 |
| 4 | Customer Service | 1228 |
| 5 | Marketing | 830 |
| 6 | Administrative | 630 |
| 7 | Design | 340 |
| 8 | Health Care Provider | 338 |
| 9 | Other | 325 |
table_maker_pct('function',fake_job_df)[0:10]
| Type | function | |
|---|---|---|
| 1 | Information Technology | 0.097824 |
| 2 | Sales | 0.082108 |
| 3 | Engineering | 0.075396 |
| 4 | Customer Service | 0.068684 |
| 5 | Marketing | 0.046423 |
| 6 | Administrative | 0.035237 |
| 7 | Design | 0.019017 |
| 8 | Health Care Provider | 0.018905 |
| 10 | Education | 0.018178 |
| 9 | Other | 0.018178 |
# plot top 10 function
countplot_maker('function',fake_job_df[fake_job_df['function']!='Blank'])
# top 10 fake job country-wise
# Count fake postings (fraudulent == 1) per country, most frequent first.
df_country_fake = (
    fake_job_df[fake_job_df['fraudulent'] == 1]
    .groupby('country')['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_country_fake.columns = ['country', 'fake_counts']
df_country_fake[:10]
| country | fake_counts | |
|---|---|---|
| 0 | US | 730 |
| 1 | AU | 40 |
| 2 | GB | 23 |
| 3 | Blank | 19 |
| 4 | CA | 12 |
| 5 | MY | 12 |
| 6 | QA | 6 |
| 7 | BH | 5 |
| 8 | IN | 4 |
| 9 | PL | 3 |
# real job posts country-wise
# Count real postings (fraudulent == 0) per country, most frequent first.
df_country_real = (
    fake_job_df[fake_job_df['fraudulent'] == 0]
    .groupby('country')['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_country_real.columns = ['country', 'real_counts']
df_country_real[:10]
| country | real_counts | |
|---|---|---|
| 0 | US | 9926 |
| 1 | GB | 2361 |
| 2 | GR | 940 |
| 3 | CA | 445 |
| 4 | DE | 383 |
| 5 | NZ | 332 |
| 6 | Blank | 327 |
| 7 | IN | 272 |
| 8 | AU | 174 |
| 9 | PH | 131 |
# Total postings per country (counts non-null `function` entries), top 10 only.
df_country = fake_job_df.groupby('country')['function'].count().reset_index()
df_country.columns = ['country', "job post count"]
df_country = df_country.sort_values(by='job post count', ascending=False).iloc[:10]
df_country.reset_index(drop=True, inplace=True)
df_country
| country | job post count | |
|---|---|---|
| 0 | US | 10656 |
| 1 | GB | 2384 |
| 2 | GR | 940 |
| 3 | CA | 457 |
| 4 | DE | 383 |
| 5 | Blank | 346 |
| 6 | NZ | 332 |
| 7 | IN | 276 |
| 8 | AU | 214 |
| 9 | PH | 132 |
# Side-by-side bars: top-10 countries for fake vs. real postings.
# BUG FIX: the old cell called ax1.set_title(), but no `ax1` exists in this
# scope (ax1 is local to str_len_count), so it raised NameError; plt.title()
# below already sets the intended titles.
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.bar(x=df_country_fake[:10]['country'], height=df_country_fake[:10]['fake_counts'], color='gold')
plt.title("Fake job by country", fontsize='14')
plt.xticks(rotation=45)
plt.subplot(1, 2, 2)
plt.bar(x=df_country_real[:10]['country'], height=df_country_real[:10]['real_counts'])
plt.title("Real job by country", fontsize='14')
plt.xticks(rotation=45)
plt.show()
# Bar chart of total postings for the top-10 countries computed above.
plt.figure(figsize=(12, 6))
plt.title("Total Job posting country_wise", fontsize='14')
plt.bar(x=df_country['country'], height=df_country['job post count'])
plt.xlabel('country')
plt.ylabel("counts")
plt.show()
# Total postings per city (counts non-null `function` entries).
df_city = fake_job_df.groupby('city')['function'].count().reset_index()
df_city.columns = ['city', "job post count"]
df_city = df_city.sort_values(by='job post count', ascending=False).iloc[:10]
df_city.reset_index(drop=True, inplace=True)
# Drop row 0 — presumably the 'Blank' placeholder city — keeping rows 1-9.
# NOTE(review): this leaves only 9 cities; confirm "top 10 non-blank" was the intent.
df_city = df_city[1:11]
df_city
| city | job post count | |
|---|---|---|
| 1 | London | 1060 |
| 2 | New York | 669 |
| 3 | Athens | 542 |
| 4 | San Francisco | 477 |
| 5 | Houston | 271 |
| 6 | Berlin | 262 |
| 7 | Chicago | 257 |
| 8 | Washington | 255 |
| 9 | Auckland | 226 |
# top 10 fake job city_wise
df_city_fake = (
    fake_job_df[fake_job_df['fraudulent'] == 1]
    .groupby('city')['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_city_fake.columns = ['city', 'fake_counts']
# skip row 0 (presumably the 'Blank' placeholder city) and keep the next 10
df_city_fake = df_city_fake[1:11]
df_city_fake
| city | fake_counts | |
|---|---|---|
| 1 | Houston | 92 |
| 2 | Sydney | 31 |
| 3 | Bakersfield | 24 |
| 4 | Los Angeles | 23 |
| 5 | San Mateo | 22 |
| 6 | New York | 20 |
| 7 | San Jose | 16 |
| 8 | AUSTIN | 15 |
| 9 | DALLAS | 12 |
| 10 | San Francisco | 11 |
# top 10 real job city_wise
df_city_real = (
    fake_job_df[fake_job_df['fraudulent'] == 0]
    .groupby('city')['title']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
df_city_real.columns = ['city', 'real_counts']
# skip row 0 (presumably the 'Blank' placeholder city) and keep the next 10
df_city_real = df_city_real[1:11]
df_city_real
| city | real_counts | |
|---|---|---|
| 1 | London | 1056 |
| 2 | New York | 649 |
| 3 | Athens | 542 |
| 4 | San Francisco | 466 |
| 5 | Berlin | 262 |
| 6 | Washington | 253 |
| 7 | Chicago | 253 |
| 8 | Auckland | 226 |
| 9 | Houston | 179 |
| 10 | Austin | 171 |
# fake vs. real job postings, city-wise
# BUG FIXES: (1) `ax1` was referenced but is not defined in this scope
# (it is local to str_len_count), raising NameError — plt.title() already sets
# the titles; (2) the bar heights indexed a non-existent 'counts' column — the
# frames' columns are 'fake_counts' and 'real_counts' (KeyError).
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.bar(x=df_city_fake[:10]['city'], height=df_city_fake[:10]['fake_counts'], color='gold')
plt.title("Fake job by city", fontsize='14')
plt.xticks(rotation=45)
plt.subplot(1, 2, 2)
plt.bar(x=df_city_real[:10]['city'], height=df_city_real[:10]['real_counts'])
plt.title("Real job by city", fontsize='14')
plt.xticks(rotation=45)
plt.show()
# Bar chart of total postings for the top cities computed above.
plt.figure(figsize=(12, 6))
plt.title("city-wise Job posting", fontsize='14')
plt.bar(x=df_city['city'], height=df_city['job post count'])
plt.xlabel('city')
plt.ylabel("counts")
plt.show()
# function: plot the character-length distribution of a target feature
def str_len_count(col, df):
    """Plot side-by-side histograms of character counts of df[col] for fake vs. real posts."""
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(17, 5), dpi=100)
    # fake posts (fraudulent == 1)
    length = df[df["fraudulent"] == 1][col].str.len()
    ax1.hist(length, bins=20, color='gold')
    ax1.set_title('Fake Post')
    # real posts (fraudulent == 0)
    # BUG FIX: this branch previously read the global fake_job_df instead of
    # the `df` parameter, silently ignoring the caller's frame.
    length = df[df["fraudulent"] == 0][col].str.len()
    ax2.hist(length, bins=20)
    ax2.set_title('Real Post')
    fig.suptitle(f'Characters in {col}')
    plt.show()
# n=0,1 1=fake 0=real
# function: return the target feature's character-count mean and median
def avg_table_maker(col, df, n):
    """Mean and median character length of df[col] for one class.

    n=1 selects fake posts (fraudulent == 1); n=0 selects real posts.
    Any other n prints a usage message and returns None.
    """
    if n == 1:
        stats = df[df["fraudulent"] == 1][col].str.len().agg(['mean', 'median'])
        out_df = stats.to_frame()
        out_df.columns = [f'Fake job {col}']
        return out_df
    elif n == 0:
        stats = df[df["fraudulent"] == 0][col].str.len().agg(['mean', 'median'])
        out_df = stats.to_frame()
        out_df.columns = [f'real job {col}']
        return out_df
    else:
        print("n only = 0 or 1")
avg_table_maker('description',fake_job_df,0)
| real job description | |
|---|---|
| mean | 1221.1694 |
| median | 1027.0000 |
avg_table_maker('description',fake_job_df,1)
| Fake job description | |
|---|---|
| mean | 1154.840647 |
| median | 844.500000 |
str_len_count('description',fake_job_df)
avg_table_maker('company_profile',fake_job_df,1)
| Fake job company_profile | |
|---|---|
| mean | 234.2806 |
| median | 5.0000 |
avg_table_maker('company_profile',fake_job_df,0)
| real job company_profile | |
|---|---|
| mean | 641.51443 |
| median | 588.00000 |
str_len_count('company_profile',fake_job_df)
avg_table_maker('requirements',fake_job_df,1)
| Fake job requirements | |
|---|---|
| mean | 446.938799 |
| median | 249.000000 |
avg_table_maker('requirements',fake_job_df,0)
| real job requirements | |
|---|---|
| mean | 598.085993 |
| median | 476.000000 |
str_len_count('requirements',fake_job_df)
avg_table_maker('benefits',fake_job_df,1)
| Fake job benefits | |
|---|---|
| mean | 214.297921 |
| median | 36.000000 |
avg_table_maker('benefits',fake_job_df,0)
| real job benefits | |
|---|---|
| mean | 210.658614 |
| median | 47.000000 |
str_len_count('benefits',fake_job_df)